package org.apache.lucene.codecs.compressing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.CODEC_SFX_DAT;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.CODEC_SFX_IDX;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.FLAGS_BITS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.OFFSETS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PACKED_BLOCK_SIZE;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PAYLOADS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.POSITIONS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CHUNK_STATS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_START;

/**
 * {@link TermVectorsReader} for {@link CompressingTermVectorsFormat}.
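 * <p>
 * The vectors index file is loaded into memory when the reader is created,
 * while the data file stays on disk and is read on demand. A minimal usage
 * sketch follows; the {@code dir}, {@code si}, {@code infos}, {@code formatName}
 * and {@code docID} values are assumed to be supplied by the owning format, as
 * in {@link CompressingTermVectorsFormat#vectorsReader}:
 * <pre class="prettyprint">
 * TermVectorsReader reader = new CompressingTermVectorsReader(
 *     dir, si, "", infos, IOContext.READ, formatName, CompressionMode.FAST);
 * Fields vectors = reader.get(docID); // null if the doc has no term vectors
 * // ... use the vectors, then close the reader
 * </pre>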
 * @lucene.experimental
 */
public final class CompressingTermVectorsReader extends TermVectorsReader implements Closeable {

  private final FieldInfos fieldInfos;
  final CompressingStoredFieldsIndexReader indexReader;
  final IndexInput vectorsStream;
  private final int version;
  private final int packedIntsVersion;
  private final CompressionMode compressionMode;
  private final Decompressor decompressor;
  private final int chunkSize;
  private final int numDocs;
  private boolean closed;
  private final BlockPackedReaderIterator reader;
  private final long numChunks; // number of compressed blocks written
  private final long numDirtyChunks; // number of incomplete compressed blocks written
  private final long maxPointer; // end of the data section

  // used by clone
  private CompressingTermVectorsReader(CompressingTermVectorsReader reader) {
    this.fieldInfos = reader.fieldInfos;
    this.vectorsStream = reader.vectorsStream.clone();
    this.indexReader = reader.indexReader.clone();
    this.packedIntsVersion = reader.packedIntsVersion;
    this.compressionMode = reader.compressionMode;
    this.decompressor = reader.decompressor.clone();
    this.chunkSize = reader.chunkSize;
    this.numDocs = reader.numDocs;
    this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);
    this.version = reader.version;
    this.numChunks = reader.numChunks;
    this.numDirtyChunks = reader.numDirtyChunks;
    this.maxPointer = reader.maxPointer;
    this.closed = false;
  }

  /** Sole constructor. */
  public CompressingTermVectorsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
      IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.maxDoc();
    int version = -1;
    CompressingStoredFieldsIndexReader indexReader = null;

    long maxPointer = -1;

    // Load the index into memory
    final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
    try (ChecksumIndexInput input = d.openChecksumInput(indexName, context)) {
      Throwable priorE = null;
      try {
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
        version = CodecUtil.checkIndexHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
        assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(input, si);
        maxPointer = input.readVLong(); // the end of the data section
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
    }

    this.version = version;
    this.indexReader = indexReader;
    this.maxPointer = maxPointer;

    try {
      // Open the data file and read metadata
      final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
      vectorsStream = d.openInput(vectorsStreamFN, context);
      final String codecNameDat = formatName + CODEC_SFX_DAT;
      int version2 = CodecUtil.checkIndexHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
      if (version != version2) {
        throw new CorruptIndexException("Version mismatch between term vectors index and data: " + version + " != " + version2, vectorsStream);
      }
      assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();

      long pos = vectorsStream.getFilePointer();

      if (version >= VERSION_CHUNK_STATS) {
        vectorsStream.seek(maxPointer);
        numChunks = vectorsStream.readVLong();
        numDirtyChunks = vectorsStream.readVLong();
        if (numDirtyChunks > numChunks) {
          throw new CorruptIndexException("invalid chunk counts: dirty=" + numDirtyChunks + ", total=" + numChunks, vectorsStream);
        }
      } else {
        numChunks = numDirtyChunks = -1;
      }

      // NOTE: the data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer, which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(vectorsStream);
      vectorsStream.seek(pos);

      packedIntsVersion = vectorsStream.readVInt();
      chunkSize = vectorsStream.readVInt();
      decompressor = compressionMode.newDecompressor();
      this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  CompressionMode getCompressionMode() {
    return compressionMode;
  }

  int getChunkSize() {
    return chunkSize;
  }

  int getPackedIntsVersion() {
    return packedIntsVersion;
  }

  int getVersion() {
    return version;
  }

  CompressingStoredFieldsIndexReader getIndexReader() {
    return indexReader;
  }

  IndexInput getVectorsStream() {
    return vectorsStream;
  }

  long getMaxPointer() {
    return maxPointer;
  }

  long getNumChunks() {
    return numChunks;
  }

  long getNumDirtyChunks() {
    return numDirtyChunks;
  }

  /**
   * @throws AlreadyClosedException if this TermVectorsReader is closed
   */
  private void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this TermVectorsReader is closed");
    }
  }

  @Override
  public void close() throws IOException {
    if (!closed) {
      IOUtils.close(vectorsStream);
      closed = true;
    }
  }

  @Override
  public TermVectorsReader clone() {
    return new CompressingTermVectorsReader(this);
  }

  @Override
  public Fields get(int doc) throws IOException {
    ensureOpen();

    // seek to the right place
    {
      final long startPointer = indexReader.getStartPointer(doc);
      vectorsStream.seek(startPointer);
    }

    // decode
    // - docBase: first doc ID of the chunk
    // - chunkDocs: number of docs of the chunk
    final int docBase = vectorsStream.readVInt();
    final int chunkDocs = vectorsStream.readVInt();
    if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
      throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
    }

    final int skip; // number of fields to skip
    final int numFields; // number of fields of the document we're looking for
    final int totalFields; // total number of fields of the chunk (sum for all docs)
    if (chunkDocs == 1) {
      skip = 0;
      numFields = totalFields = vectorsStream.readVInt();
    } else {
      reader.reset(vectorsStream, chunkDocs);
      int sum = 0;
      for (int i = docBase; i < doc; ++i) {
        sum += reader.next();
      }
      skip = sum;
      numFields = (int) reader.next();
      sum += numFields;
      for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
        sum += reader.next();
      }
      totalFields = sum;
    }

    if (numFields == 0) {
      // no vectors
      return null;
    }

    // read field numbers that have term vectors
    final int[] fieldNums;
    {
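      // the token packs two values: bits 0-4 hold the number of bits per field
      // number, bits 5-7 hold (the number of distinct field numbers - 1), and a
      // high value of 0x07 signals that an extra vInt follows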
      final int token = vectorsStream.readByte() & 0xFF;
      assert token != 0; // means no term vectors, cannot happen since we checked for numFields == 0
      final int bitsPerFieldNum = token & 0x1F;
      int totalDistinctFields = token >>> 5;
      if (totalDistinctFields == 0x07) {
        totalDistinctFields += vectorsStream.readVInt();
      }
      ++totalDistinctFields;
      final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
      fieldNums = new int[totalDistinctFields];
      for (int i = 0; i < totalDistinctFields; ++i) {
        fieldNums[i] = (int) it.next();
      }
    }

    // read field numbers and flags
    final int[] fieldNumOffs = new int[numFields];
    final PackedInts.Reader flags;
    {
      final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
      final PackedInts.Reader allFieldNumOffs = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
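      // a vInt tells how flags were written: 0 means one flag per distinct field
      // number (expanded below into one flag per field occurrence), 1 means one
      // flag was stored for every field occurrence of the chunk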
      switch (vectorsStream.readVInt()) {
        case 0:
          final PackedInts.Reader fieldFlags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, fieldNums.length, FLAGS_BITS);
          PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
          for (int i = 0; i < totalFields; ++i) {
            final int fieldNumOff = (int) allFieldNumOffs.get(i);
            assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
            final int fgs = (int) fieldFlags.get(fieldNumOff);
            f.set(i, fgs);
          }
          flags = f;
          break;
        case 1:
          flags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, FLAGS_BITS);
          break;
        default:
          throw new AssertionError();
      }
      for (int i = 0; i < numFields; ++i) {
        fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
      }
    }

    // number of terms per field for all fields
    final PackedInts.Reader numTerms;
    final int totalTerms;
    {
      final int bitsRequired = vectorsStream.readVInt();
      numTerms = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
      int sum = 0;
      for (int i = 0; i < totalFields; ++i) {
        sum += numTerms.get(i);
      }
      totalTerms = sum;
    }

    // term lengths
    int docOff = 0, docLen = 0, totalLen;
    final int[] fieldLengths = new int[numFields];
    final int[][] prefixLengths = new int[numFields][];
    final int[][] suffixLengths = new int[numFields][];
    {
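      // terms are incrementally encoded: each term stores the length of the
      // prefix it shares with the previous term plus the length of its suffix,
      // and only the suffix bytes appear in the data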
      reader.reset(vectorsStream, totalTerms);
      // skip
      int toSkip = 0;
      for (int i = 0; i < skip; ++i) {
        toSkip += numTerms.get(i);
      }
      reader.skip(toSkip);
      // read prefix lengths
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        final int[] fieldPrefixLengths = new int[termCount];
        prefixLengths[i] = fieldPrefixLengths;
        for (int j = 0; j < termCount; ) {
          final LongsRef next = reader.next(termCount - j);
          for (int k = 0; k < next.length; ++k) {
            fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
          }
        }
      }
      reader.skip(totalTerms - reader.ord());

      reader.reset(vectorsStream, totalTerms);
      // skip
      toSkip = 0;
      for (int i = 0; i < skip; ++i) {
        for (int j = 0; j < numTerms.get(i); ++j) {
          docOff += reader.next();
        }
      }
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        final int[] fieldSuffixLengths = new int[termCount];
        suffixLengths[i] = fieldSuffixLengths;
        for (int j = 0; j < termCount; ) {
          final LongsRef next = reader.next(termCount - j);
          for (int k = 0; k < next.length; ++k) {
            fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
          }
        }
        fieldLengths[i] = sum(suffixLengths[i]);
        docLen += fieldLengths[i];
      }
      totalLen = docOff + docLen;
      for (int i = skip + numFields; i < totalFields; ++i) {
        for (int j = 0; j < numTerms.get(i); ++j) {
          totalLen += reader.next();
        }
      }
    }

    // term freqs
    final int[] termFreqs = new int[totalTerms];
    {
      reader.reset(vectorsStream, totalTerms);
      for (int i = 0; i < totalTerms; ) {
        final LongsRef next = reader.next(totalTerms - i);
        for (int k = 0; k < next.length; ++k) {
          termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
        }
      }
    }

    // total number of positions, offsets and payloads
    int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
    for (int i = 0, termIndex = 0; i < totalFields; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      for (int j = 0; j < termCount; ++j) {
        final int freq = termFreqs[termIndex++];
        if ((f & POSITIONS) != 0) {
          totalPositions += freq;
        }
        if ((f & OFFSETS) != 0) {
          totalOffsets += freq;
        }
        if ((f & PAYLOADS) != 0) {
          totalPayloads += freq;
        }
      }
      assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
    }

    final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
    final int[][] positions, startOffsets, lengths;
    if (totalPositions > 0) {
      positions = readPositions(skip, numFields, flags, numTerms, termFreqs, POSITIONS, totalPositions, positionIndex);
    } else {
      positions = new int[numFields][];
    }

    if (totalOffsets > 0) {
      // average number of chars per term
      final float[] charsPerTerm = new float[fieldNums.length];
      for (int i = 0; i < charsPerTerm.length; ++i) {
        charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
      }
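      // start offsets were written as deltas from an expected value derived
      // from the position and the average number of chars per term, which keeps
      // the stored values small; the expected value is added back below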
      startOffsets = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
      lengths = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);

      for (int i = 0; i < numFields; ++i) {
        final int[] fStartOffsets = startOffsets[i];
        final int[] fPositions = positions[i];
        // patch offsets from positions
        if (fStartOffsets != null && fPositions != null) {
          final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
          for (int j = 0; j < startOffsets[i].length; ++j) {
            fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
          }
        }
        if (fStartOffsets != null) {
          final int[] fPrefixLengths = prefixLengths[i];
          final int[] fSuffixLengths = suffixLengths[i];
          final int[] fLengths = lengths[i];
          for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
            // delta-decode start offsets and patch lengths using term lengths
            final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
            lengths[i][positionIndex[i][j]] += termLength;
            for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
              fStartOffsets[k] += fStartOffsets[k - 1];
              fLengths[k] += termLength;
            }
          }
        }
      }
    } else {
      startOffsets = lengths = new int[numFields][];
    }
    if (totalPositions > 0) {
      // delta-decode positions
      for (int i = 0; i < numFields; ++i) {
        final int[] fPositions = positions[i];
        final int[] fpositionIndex = positionIndex[i];
        if (fPositions != null) {
          for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
            // delta-decode positions
            for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
              fPositions[k] += fPositions[k - 1];
            }
          }
        }
      }
    }

    // payload lengths
    final int[][] payloadIndex = new int[numFields][];
    int totalPayloadLength = 0;
    int payloadOff = 0;
    int payloadLen = 0;
    if (totalPayloads > 0) {
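      // three passes over the payload lengths: lengths of fields that precede
      // the requested document only grow payloadOff, lengths of the document
      // itself build payloadIndex, and the remaining lengths only contribute
      // to totalPayloadLength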
      reader.reset(vectorsStream, totalPayloads);
      // skip
      int termIndex = 0;
      for (int i = 0; i < skip; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        if ((f & PAYLOADS) != 0) {
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              final int l = (int) reader.next();
              payloadOff += l;
            }
          }
        }
        termIndex += termCount;
      }
      totalPayloadLength = payloadOff;
      // read doc payload lengths
      for (int i = 0; i < numFields; ++i) {
        final int f = (int) flags.get(skip + i);
        final int termCount = (int) numTerms.get(skip + i);
        if ((f & PAYLOADS) != 0) {
          final int totalFreq = positionIndex[i][termCount];
          payloadIndex[i] = new int[totalFreq + 1];
          int posIdx = 0;
          payloadIndex[i][posIdx] = payloadLen;
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              final int payloadLength = (int) reader.next();
              payloadLen += payloadLength;
              payloadIndex[i][posIdx + 1] = payloadLen;
              ++posIdx;
            }
          }
          assert posIdx == totalFreq;
        }
        termIndex += termCount;
      }
      totalPayloadLength += payloadLen;
      for (int i = skip + numFields; i < totalFields; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        if ((f & PAYLOADS) != 0) {
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              totalPayloadLength += reader.next();
            }
          }
        }
        termIndex += termCount;
      }
      assert termIndex == totalTerms : termIndex + " " + totalTerms;
    }

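    // each document's bytes are laid out as its term suffix bytes immediately
    // followed by its payload bytes, so a single decompress call starting at
    // docOff + payloadOff with length docLen + payloadLen yields both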
    // decompress data
    final BytesRef suffixBytes = new BytesRef();
    decompressor.decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
    suffixBytes.length = docLen;
    final BytesRef payloadBytes = new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);

    final int[] fieldFlags = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
      fieldFlags[i] = (int) flags.get(skip + i);
    }

    final int[] fieldNumTerms = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
      fieldNumTerms[i] = (int) numTerms.get(skip + i);
    }

    final int[][] fieldTermFreqs = new int[numFields][];
    {
      int termIdx = 0;
      for (int i = 0; i < skip; ++i) {
        termIdx += numTerms.get(i);
      }
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        fieldTermFreqs[i] = new int[termCount];
        for (int j = 0; j < termCount; ++j) {
          fieldTermFreqs[i][j] = termFreqs[termIdx++];
        }
      }
    }

    assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;

    return new TVFields(fieldNums, fieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths,
        prefixLengths, suffixLengths, fieldTermFreqs,
        positionIndex, positions, startOffsets, lengths,
        payloadBytes, payloadIndex,
        suffixBytes);
  }

  // field -> term index -> position index
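  // positionIndex[field][term] is the index of the first position of that term
  // in the field's flat positions array; the extra last entry holds the total
  // number of positions of the field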
  private int[][] positionIndex(int skip, int numFields, PackedInts.Reader numTerms, int[] termFreqs) {
    final int[][] positionIndex = new int[numFields][];
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
      final int termCount = (int) numTerms.get(i);
      termIndex += termCount;
    }
    for (int i = 0; i < numFields; ++i) {
      final int termCount = (int) numTerms.get(skip + i);
      positionIndex[i] = new int[termCount + 1];
      for (int j = 0; j < termCount; ++j) {
        final int freq = termFreqs[termIndex + j];
        positionIndex[i][j + 1] = positionIndex[i][j] + freq;
      }
      termIndex += termCount;
    }
    return positionIndex;
  }

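  // reads one of the position-indexed streams (positions, start offsets or
  // lengths, depending on the requested flag) for the fields of the current
  // document, skipping the data that belongs to other documents of the chunk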
  private int[][] readPositions(int skip, int numFields, PackedInts.Reader flags, PackedInts.Reader numTerms, int[] termFreqs, int flag, final int totalPositions, int[][] positionIndex) throws IOException {
    final int[][] positions = new int[numFields][];
    reader.reset(vectorsStream, totalPositions);
    // skip
    int toSkip = 0;
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      if ((f & flag) != 0) {
        for (int j = 0; j < termCount; ++j) {
          final int freq = termFreqs[termIndex + j];
          toSkip += freq;
        }
      }
      termIndex += termCount;
    }
    reader.skip(toSkip);
    // read doc positions
    for (int i = 0; i < numFields; ++i) {
      final int f = (int) flags.get(skip + i);
      final int termCount = (int) numTerms.get(skip + i);
      if ((f & flag) != 0) {
        final int totalFreq = positionIndex[i][termCount];
        final int[] fieldPositions = new int[totalFreq];
        positions[i] = fieldPositions;
        for (int j = 0; j < totalFreq; ) {
          final LongsRef nextPositions = reader.next(totalFreq - j);
          for (int k = 0; k < nextPositions.length; ++k) {
            fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
          }
        }
      }
      termIndex += termCount;
    }
    reader.skip(totalPositions - reader.ord());
    return positions;
  }

  private class TVFields extends Fields {

    private final int[] fieldNums, fieldFlags, fieldNumOffs, numTerms, fieldLengths;
    private final int[][] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
    private final BytesRef suffixBytes, payloadBytes;

    public TVFields(int[] fieldNums, int[] fieldFlags, int[] fieldNumOffs, int[] numTerms, int[] fieldLengths,
        int[][] prefixLengths, int[][] suffixLengths, int[][] termFreqs,
        int[][] positionIndex, int[][] positions, int[][] startOffsets, int[][] lengths,
        BytesRef payloadBytes, int[][] payloadIndex,
        BytesRef suffixBytes) {
      this.fieldNums = fieldNums;
      this.fieldFlags = fieldFlags;
      this.fieldNumOffs = fieldNumOffs;
      this.numTerms = numTerms;
      this.fieldLengths = fieldLengths;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadBytes = payloadBytes;
      this.payloadIndex = payloadIndex;
      this.suffixBytes = suffixBytes;
    }

    @Override
    public Iterator<String> iterator() {
      return new Iterator<String>() {
        int i = 0;
        @Override
        public boolean hasNext() {
          return i < fieldNumOffs.length;
        }
        @Override
        public String next() {
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          final int fieldNum = fieldNums[fieldNumOffs[i++]];
          return fieldInfos.fieldInfo(fieldNum).name;
        }
        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }

    @Override
    public Terms terms(String field) throws IOException {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      if (fieldInfo == null) {
        return null;
      }
      int idx = -1;
      for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
          idx = i;
          break;
        }
      }

      if (idx == -1 || numTerms[idx] == 0) {
        // no term
        return null;
      }
      int fieldOff = 0, fieldLen = -1;
      for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (i < idx) {
          fieldOff += fieldLengths[i];
        } else {
          fieldLen = fieldLengths[i];
          break;
        }
      }
      assert fieldLen >= 0;
      return new TVTerms(numTerms[idx], fieldFlags[idx],
          prefixLengths[idx], suffixLengths[idx], termFreqs[idx],
          positionIndex[idx], positions[idx], startOffsets[idx], lengths[idx],
          payloadIndex[idx], payloadBytes,
          new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
    }

    @Override
    public int size() {
      return fieldNumOffs.length;
    }

  }

  private class TVTerms extends Terms {

    private final int numTerms, flags;
    private final int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
    private final BytesRef termBytes, payloadBytes;

    TVTerms(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs,
        int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
        int[] payloadIndex, BytesRef payloadBytes,
        BytesRef termBytes) {
      this.numTerms = numTerms;
      this.flags = flags;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadIndex = payloadIndex;
      this.payloadBytes = payloadBytes;
      this.termBytes = termBytes;
    }

    @Override
    public TermsEnum iterator() throws IOException {
      TVTermsEnum termsEnum = new TVTermsEnum();
      termsEnum.reset(numTerms, flags, prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths,
          payloadIndex, payloadBytes,
          new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
      return termsEnum;
    }

    @Override
    public long size() throws IOException {
      return numTerms;
    }

    @Override
    public long getSumTotalTermFreq() throws IOException {
      return -1L;
    }

    @Override
    public long getSumDocFreq() throws IOException {
      return numTerms;
    }

    @Override
    public int getDocCount() throws IOException {
      return 1;
    }

    @Override
    public boolean hasFreqs() {
      return true;
    }

    @Override
    public boolean hasOffsets() {
      return (flags & OFFSETS) != 0;
    }

    @Override
    public boolean hasPositions() {
      return (flags & POSITIONS) != 0;
    }

    @Override
    public boolean hasPayloads() {
      return (flags & PAYLOADS) != 0;
    }

  }

  private static class TVTermsEnum extends TermsEnum {

    private int numTerms, startPos, ord;
    private int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
    private ByteArrayDataInput in;
    private BytesRef payloads;
    private final BytesRef term;

    private TVTermsEnum() {
      term = new BytesRef(16);
    }

    void reset(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs, int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
        int[] payloadIndex, BytesRef payloads, ByteArrayDataInput in) {
      this.numTerms = numTerms;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadIndex = payloadIndex;
      this.payloads = payloads;
      this.in = in;
      startPos = in.getPosition();
      reset();
    }

    void reset() {
      term.length = 0;
      in.setPosition(startPos);
      ord = -1;
    }

    @Override
    public BytesRef next() throws IOException {
      if (ord == numTerms - 1) {
        return null;
      } else {
        assert ord < numTerms;
        ++ord;
      }

      // read term
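      // the first prefixLengths[ord] bytes are shared with the previous term
      // and already sit in term.bytes; only the suffix is read from the stream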
      term.offset = 0;
      term.length = prefixLengths[ord] + suffixLengths[ord];
      if (term.length > term.bytes.length) {
        term.bytes = ArrayUtil.grow(term.bytes, term.length);
      }
      in.readBytes(term.bytes, prefixLengths[ord], suffixLengths[ord]);

      return term;
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
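      // terms can only be iterated in order, so seeking is a linear scan that
      // restarts from the first term whenever the current term already compares
      // greater than the target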
      if (ord < numTerms && ord >= 0) {
        final int cmp = term().compareTo(text);
        if (cmp == 0) {
          return SeekStatus.FOUND;
        } else if (cmp > 0) {
          reset();
        }
      }
      // linear scan
      while (true) {
        final BytesRef term = next();
        if (term == null) {
          return SeekStatus.END;
        }
        final int cmp = term.compareTo(text);
        if (cmp > 0) {
          return SeekStatus.NOT_FOUND;
        } else if (cmp == 0) {
          return SeekStatus.FOUND;
        }
      }
    }

    @Override
    public void seekExact(long ord) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public BytesRef term() throws IOException {
      return term;
    }

    @Override
    public long ord() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int docFreq() throws IOException {
      return 1;
    }

    @Override
    public long totalTermFreq() throws IOException {
      return termFreqs[ord];
    }

    @Override
    public final PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      if (PostingsEnum.featureRequested(flags, DocsAndPositionsEnum.OLD_NULL_SEMANTICS)) {
        if (positions == null && startOffsets == null) {
          // Neither positions nor offsets were indexed:
          return null;
        }
      }

      final TVPostingsEnum docsEnum;
      if (reuse instanceof TVPostingsEnum) {
        docsEnum = (TVPostingsEnum) reuse;
      } else {
        docsEnum = new TVPostingsEnum();
      }

      docsEnum.reset(termFreqs[ord], positionIndex[ord], positions, startOffsets, lengths, payloads, payloadIndex);
      return docsEnum;
    }

  }

  private static class TVPostingsEnum extends PostingsEnum {

    private int doc = -1;
    private int termFreq;
    private int positionIndex;
    private int[] positions;
    private int[] startOffsets;
    private int[] lengths;
    private final BytesRef payload;
    private int[] payloadIndex;
    private int basePayloadOffset;
    private int i;

    TVPostingsEnum() {
      payload = new BytesRef();
    }

    public void reset(int freq, int positionIndex, int[] positions,
        int[] startOffsets, int[] lengths, BytesRef payloads,
        int[] payloadIndex) {
      this.termFreq = freq;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.basePayloadOffset = payloads.offset;
      this.payload.bytes = payloads.bytes;
      payload.offset = payload.length = 0;
      this.payloadIndex = payloadIndex;

      doc = i = -1;
    }

    private void checkDoc() {
      if (doc == NO_MORE_DOCS) {
        throw new IllegalStateException("DocsEnum exhausted");
      } else if (doc == -1) {
        throw new IllegalStateException("DocsEnum not started");
      }
    }

    private void checkPosition() {
      checkDoc();
      if (i < 0) {
        throw new IllegalStateException("Position enum not started");
      } else if (i >= termFreq) {
        throw new IllegalStateException("Read past last position");
      }
    }

    @Override
    public int nextPosition() throws IOException {
      if (doc != 0) {
        throw new IllegalStateException();
      } else if (i >= termFreq - 1) {
        throw new IllegalStateException("Read past last position");
      }

      ++i;

      if (payloadIndex != null) {
        payload.offset = basePayloadOffset + payloadIndex[positionIndex + i];
        payload.length = payloadIndex[positionIndex + i + 1] - payloadIndex[positionIndex + i];
      }

      if (positions == null) {
        return -1;
      } else {
        return positions[positionIndex + i];
      }
    }

    @Override
    public int startOffset() throws IOException {
      checkPosition();
      if (startOffsets == null) {
        return -1;
      } else {
        return startOffsets[positionIndex + i];
      }
    }

    @Override
    public int endOffset() throws IOException {
      checkPosition();
      if (startOffsets == null) {
        return -1;
      } else {
        return startOffsets[positionIndex + i] + lengths[positionIndex + i];
      }
    }

    @Override
    public BytesRef getPayload() throws IOException {
      checkPosition();
      if (payloadIndex == null || payload.length == 0) {
        return null;
      } else {
        return payload;
      }
    }

    @Override
    public int freq() throws IOException {
      checkDoc();
      return termFreq;
    }

    @Override
    public int docID() {
      return doc;
    }

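    // a term vectors postings list always enumerates exactly one document: the
    // pseudo document 0 that stands for the document whose vector is being read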
    @Override
    public int nextDoc() throws IOException {
      if (doc == -1) {
        return (doc = 0);
      } else {
        return (doc = NO_MORE_DOCS);
      }
    }

    @Override
    public int advance(int target) throws IOException {
      return slowAdvance(target);
    }

    @Override
    public long cost() {
      return 1;
    }
  }

  private static int sum(int[] arr) {
    int sum = 0;
    for (int el : arr) {
      sum += el;
    }
    return sum;
  }

  @Override
  public long ramBytesUsed() {
    return indexReader.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.singleton(Accountables.namedAccountable("term vector index", indexReader));
  }

  @Override
  public void checkIntegrity() throws IOException {
    CodecUtil.checksumEntireFile(vectorsStream);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(mode=" + compressionMode + ",chunksize=" + chunkSize + ")";
  }
}